In [106]:
# for data manipulation
import numpy as np
import pandas as pd
# for MongoDB connection
import pymongo
import matplotlib as plt
# for statistical hypothesis testing
import scipy.stats
%matplotlib inline

In [92]:
# for interactive plotting
import plotly.plotly as py
import cufflinks as cf
# Imported explicitly so the version check below does not depend on leftover kernel state.
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Initialise plotly's offline notebook mode and configure cufflinks for offline
# rendering with the 'ggplot' theme.
init_notebook_mode(connected=True)
cf.set_config_file(offline=True, theme='ggplot')
# Function-call form of print works on both Python 2 and Python 3.
print(__version__)  # requires version >= 1.9.0


2.0.1

In [2]:
def read_mongo(collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Read documents from a MongoDB collection and store them in a DataFrame.

    Parameters
    ----------
    collection : str
        Name of the collection inside the ``appstore`` database.
    query : dict, optional
        Filter handed to ``find``. The default empty dict is passed through
        unchanged and is never mutated by this function.
    host : str, optional
        Host passed to ``pymongo.MongoClient``.
    port : int, optional
        Port passed to ``pymongo.MongoClient``.
    username, password : str, optional
        Credentials. Forwarded to ``pymongo.MongoClient`` only when given.
    no_id : bool, optional
        When True, the ``_id`` column is removed from the result if present.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by the query. An empty frame is
        returned when nothing matches.
    """
    # Forward credentials only when supplied, so unauthenticated calls reach
    # MongoClient with exactly the same arguments as before.
    credentials = {}
    if username is not None:
        credentials['username'] = username
    if password is not None:
        credentials['password'] = password

    # Connect to MongoDB and query the requested collection of the appstore DB.
    with pymongo.MongoClient(host, port, **credentials) as client:
        table = client.appstore[collection]
        df = pd.DataFrame(list(table.find(query)))

    # Delete the _id; an empty result has no such column, so check first
    # instead of raising KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

Let's read all the data first.


In [3]:
# Load every document of the 'appitems' collection (default empty query) into a DataFrame.
apps_df = read_mongo('appitems')

In [4]:
# (rows, columns) of the loaded frame
apps_df.shape  # 5658 unique apps


Out[4]:
(5658, 26)

In [10]:
# Column dtypes and non-null counts, to spot columns with missing values.
apps_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5658 entries, 0 to 5657
Data columns (total 26 columns):
category              5658 non-null object
current_rating        4522 non-null float64
description           5658 non-null object
id                    5658 non-null object
is_InAppPurcased      5658 non-null int64
is_multilingual       5658 non-null int64
is_multiplatform      5658 non-null int64
name                  5658 non-null object
new_version_desc      5658 non-null object
num_current_rating    4522 non-null float64
num_overall_rating    4450 non-null float64
overall_rating        4450 non-null float64
price                 5658 non-null object
publish_date          5658 non-null object
review1               4982 non-null object
review1_star          4982 non-null float64
review2               4648 non-null object
review2_star          4648 non-null float64
review3               4388 non-null object
review3_star          4388 non-null float64
scrape_date           5658 non-null object
seller                4881 non-null object
size                  5658 non-null object
update_date           5658 non-null object
url                   5658 non-null object
version               5658 non-null object
dtypes: float64(7), int64(3), object(16)
memory usage: 1.1+ MB

We can see that there are some missing values in the current and overall ratings. Since the overall rating is stored as a text label (e.g. "4 and a half stars"), we convert it to a numeric value for the convenience of visualization.


In [5]:
# Lookup table from the textual star labels to their numeric values.
rating_cleaned = {
    '1 star': 1,
    '1 and a half stars': 1.5,
    '2 stars': 2,
    '2 and a half stars': 2.5,
    '3 stars': 3,
    '3 and a half stars': 3.5,
    '4 stars': 4,
    '4 and a half stars': 4.5,
    '5 stars': 5,
}

In [6]:
# Swap each textual star label in overall_rating for its numeric value.
apps_df['overall_rating'] = apps_df['overall_rating'].replace(rating_cleaned)
# apps_df.to_pickle('app_cleaned.pickle')

Let's check out the distribution of apps in terms of category and rating.


In [56]:
# Number of apps for every (category, overall_rating) pair, as a flat table ...
cate_cnt = (
    apps_df
    .groupby(['category', 'overall_rating'])['id']
    .count()
    .reset_index()
)
# ... then reshaped to one row per category and one column per rating value;
# pairs that never occur are filled with 0.
rate_cate_cnt = cate_cnt.pivot_table(
    index='category',
    columns='overall_rating',
    values='id',
    fill_value=0
)

In [74]:
# Stacked bar chart built from the category-by-rating count table.
rate_cate_cnt.iplot(
    kind='bar',
    barmode='stack',
    yTitle='Number of Apps',
    title='Distribution of Apps by Category and Rating',
    colorscale='Paired',
    theme='white',
    labels='Rating'
)


We also want to check the relationship between the current rating and the overall rating.


In [91]:
# Keep only the rating-related columns and the apps for which all of them are present.
# The explicit .copy() makes rating_df an independent frame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
rating_df = apps_df[
    ["name", "overall_rating", "current_rating", 'num_current_rating', "num_overall_rating"]
].dropna().copy()

In [90]:
# Bubble chart of overall vs. current rating; bubble size follows the number of
# current ratings and the app name is attached as text.
rating_df.iplot(
    kind="bubble",
    x="overall_rating",
    y="current_rating",
    size='num_current_rating',
    text='name',
    xTitle='Overall Rating',
    yTitle='Current Rating'
)



In [95]:
# Marker-only scatter of the current rating against how many current ratings an app has.
rating_df.iplot(
    kind="scatter",
    mode="markers",
    x="current_rating",
    y="num_current_rating",
    text="name",
    size=5,
    xTitle="Current Rating",
    yTitle="Num of Current Rating"
)



In [105]:
# py.iplot(
#     {
#         'data': [
#             {
#                 'x': df[df['year']==year]['gdpPercap'],
#                 'y': df[df['year']==year]['lifeExp'],
#                 'name': year, 'mode': 'markers',
#             } for year in [1952, 1982, 2007]
#         ],
#         'layout': {
#             'xaxis': {'title': 'GDP per Capita', 'type': 'log'},
#             'yaxis': {'title': "Life Expectancy"}
#         }
# }, )

We want to define a more balanced metric to evaluate the quality of an app to facilitate our further analysis.

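$$ \text{weighted rating} = \frac{n_{\text{current}}}{n_{\text{overall}}} \times \text{current rating} + \left(1-\frac{n_{\text{current}}}{n_{\text{overall}}}\right) \times \text{overall rating} $$

where $n_{\text{current}}$ is the number of current ratings and $n_{\text{overall}}$ is the number of overall ratings.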

In [96]:
# weighted_rating = w * current_rating + (1 - w) * overall_rating,
# with w = num_current_rating / num_overall_rating.
# Computed with column arithmetic instead of map(lambda ...): on Python 3 map returns
# a lazy iterator, which pandas cannot assign as a column, and the column form avoids
# a Python-level loop over the rows.
current_share = rating_df['num_current_rating'] / rating_df['num_overall_rating']
rating_df['weighted_rating'] = (
    current_share * rating_df['current_rating']
    + (1 - current_share) * rating_df['overall_rating']
)

In [104]:
# Stacked histogram comparing the three rating metrics.
metric_columns = ['weighted_rating', 'current_rating', 'overall_rating']
rating_df[metric_columns].iplot(
    kind='histogram',
    barmode='stack',
    theme='white',
    title='Distribution of Rating Metrics'
)


We want to know whether offering in-app purchases significantly affects an app's rating, especially for gaming apps. We can answer this question with a hypothesis test. Since the ratings are not normally distributed, a t-test or a one-way ANOVA is not appropriate. Instead, we use the Kruskal-Wallis H-test, a non-parametric test that does not assume normality and mainly requires the observations to be independent.


In [113]:
# Apps with is_InAppPurcased == 0 (no in-app purchases) that have an overall rating.
# Rows and columns are selected in a single .loc call and the result is copied, so
# adding columns to free_df later does not trigger SettingWithCopyWarning.
free_df = apps_df.loc[
    (apps_df['is_InAppPurcased'] == 0) & apps_df['overall_rating'].notnull(),
    ["name", "overall_rating", "current_rating", 'num_current_rating', "num_overall_rating"]
].copy()

In [115]:
# Apps with is_InAppPurcased == 1 (in-app purchases offered) that have an overall rating.
# Rows and columns are selected in a single .loc call and the result is copied, so
# adding columns to paid_df later does not trigger SettingWithCopyWarning.
paid_df = apps_df.loc[
    (apps_df['is_InAppPurcased'] == 1) & apps_df['overall_rating'].notnull(),
    ["name", "overall_rating", "current_rating", 'num_current_rating', "num_overall_rating"]
].copy()

In [116]:
def _weighted_rating(df):
    """Blend the current and overall rating of every row in ``df``.

    Returns ``w * current_rating + (1 - w) * overall_rating`` with
    ``w = num_current_rating / num_overall_rating``, computed column-wise.
    """
    current_share = df['num_current_rating'] / df['num_overall_rating']
    return current_share * df['current_rating'] + (1 - current_share) * df['overall_rating']


# One shared, vectorised helper instead of two copy-pasted map(lambda ...) calls:
# on Python 3 map returns a lazy iterator, which pandas cannot assign as a column.
free_df['weighted_rating'] = _weighted_rating(free_df)
paid_df['weighted_rating'] = _weighted_rating(paid_df)

In [117]:
# Plain Python lists of the weighted ratings, one list per group.
free, paid = (list(group_df['weighted_rating']) for group_df in (free_df, paid_df))

$H_0:$ The medians of two groups are the same. $H_1$: The medians of two groups are different.


In [123]:
# Kruskal-Wallis H-test on the weighted ratings of the two groups.
scipy.stats.kruskal(free, paid)


Out[123]:
KruskalResult(statistic=187.92790428584749, pvalue=9.0081895155668011e-43)

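Thus, we can reject the null hypothesis. The difference in ratings between apps with and without in-app purchases is statistically significant, which suggests that offering in-app purchases is associated with the user experience of an app, although the association needs further exploration.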


In [ ]: